package belloCollector.crawler;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.lang.StringUtils;
import org.json.JSONObject;
import org.jsoup.select.Elements;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;

/**
 * Crawls WeChat articles via Sogou's WeChat search static list pages.
 *
 * <p>Three page types are visited (tagged via the {@code "type"} meta):
 * {@code articleList} (a list page whose article links are queued),
 * {@code content} (the article itself; fields are scraped and stored in {@link #map}),
 * and {@code comment} (a JSON endpoint whose stats are merged into the stored
 * article, which is then appended to {@link #articles}).
 *
 * <p>Thread-safety: {@code visit} runs concurrently on the crawler's worker
 * threads (see {@code setThreads(10)} in {@code main}), so all shared state
 * below must be thread-safe.
 */
public class SougouWechatArticleCrawler extends BreadthCrawler {

	public SougouWechatArticleCrawler(String crawlPath, boolean autoParse) {
		super(crawlPath, autoParse);
	}

	// Finished articles (content fields + comment stats merged). Synchronized
	// because multiple worker threads append concurrently.
	static List<Map<String, String>> articles = Collections.synchronizedList(new ArrayList<Map<String, String>>());
	// Article info maps keyed by article URL. Concurrent because the content
	// page and its comment page are typically visited on different threads.
	static Map<String, Map<String, String>> map = new ConcurrentHashMap<String, Map<String, String>>();

	/**
	 * Text of the first matched element, or "" when the selector matched
	 * nothing — avoids the NPE that {@code els.first().text()} would throw
	 * on pages missing an expected element.
	 */
	private static String firstText(Elements els) {
		return els.isEmpty() ? "" : els.first().text();
	}

	@Override
	public void visit(Page page, CrawlDatums next) {
		String type = page.meta("type");
		// Constant-first equals: page.meta may return null for untagged pages.
		if ("articleList".equals(type)) {
			// List page: queue every article link as a "content" page.
			next.add(page.getLinks(".txt-box h3>a")).meta("type", "content").meta("depth", "2");
		} else if ("content".equals(type)) {
			String url = page.url();
			// Derive the comment/stats endpoint from the article URL and queue it,
			// remembering the article URL so the stats can be merged back later.
			String commentUrl = url.replace("/s?", "/mp/getcomment?");
			next.add(new CrawlDatum(commentUrl).meta("type", "comment").meta("refer", url).meta("depth", "3"));
			String title = page.doc().title();
			String content = firstText(page.select("#js_content"));
			String copyright = firstText(page.select("#copyright_logo"));
			String postDate = firstText(page.select("#post-date"));
			String wx_nickname = firstText(page.select("#js_profile_qrcode .profile_nickname"));
			String wx_name = firstText(page.select("#js_profile_qrcode .profile_meta_value"));
			// The second .profile_meta_value carries the account's feature description.
			String funcIntro = page.select("#js_profile_qrcode .profile_meta_value").eq(1).text();
			// Local map — this used to be a shared instance field, which is a
			// data race when visit() runs on 10 threads.
			Map<String, String> articleMap = new HashMap<String, String>();
			articleMap.put("url", url);
			articleMap.put("commentUrl", commentUrl);
			articleMap.put("title", title);
			articleMap.put("content", content);
			articleMap.put("copyright", copyright);
			articleMap.put("postDate", postDate);
			articleMap.put("wx_nickname", wx_nickname);
			articleMap.put("wx_name", wx_name);
			articleMap.put("funcIntro", funcIntro);
			map.put(url, articleMap);
		} else if ("comment".equals(type)) {
			String commentHtml = page.html();
			if (StringUtils.isNotBlank(commentHtml)) {
				JSONObject json = new JSONObject(commentHtml);
				// opt() instead of get(): the endpoint sometimes omits fields;
				// String.valueOf(null) yields "null" rather than an exception.
				String articleComment = String.valueOf(json.opt("comment"));
				String readNum = String.valueOf(json.opt("read_num"));
				String likeNum = String.valueOf(json.opt("like_num"));
				String referUrl = page.meta("refer");
				Map<String, String> article = map.get(referUrl);
				// Null guard: the content page may have failed or not been stored,
				// in which case there is nothing to merge the stats into.
				if (article != null) {
					article.put("articleComment", articleComment);
					article.put("readNum", readNum);
					article.put("likeNum", likeNum);
					articles.add(article);
				}
			}
		}
	}

	public static void main(String[] args) throws Exception {
		// Build the seed URL set for Sogou's static WeChat list pages.
		CrawlDatums datums = new CrawlDatums();
		CrawlDatum datum = null;
		// Pattern: http://weixin.sogou.com/pcindex/pc/pc_<cat>/<page>.html
		String url_0 = "http://weixin.sogou.com/pcindex/pc/pc_%s";
		String seedUrl = "";
		for (int i = 0; i < 1; i++) { // category index; observed maximum is 19
			String url = String.format(url_0, i).concat("/%s.html");
			for (int m = 0; m < 16; m++) {
				// Page 0 is named "pc_<cat>.html"; the rest are "1.html".."15.html".
				if (m == 0) {
					seedUrl = String.format(url, "pc_" + i);
				} else {
					seedUrl = String.format(url, m);
				}
				datum = new CrawlDatum(seedUrl).meta("type", "articleList").meta("depth", "1");
				datums.add(datum);
			}
		}
		SougouWechatArticleCrawler articleCrawler = new SougouWechatArticleCrawler("搜狗文章", true);
		// NOTE(review): only the first of the built seeds is submitted; add the
		// rest of `datums` here to crawl more than one list page.
		articleCrawler.addSeed(datums.get(0));
		// articleCrawler.setTopN(2);
		articleCrawler.setThreads(10);
		articleCrawler.start(3);
		// The crawler has stopped by now, so iterating the synchronized list
		// without an explicit lock is safe.
		for (Map<String, String> articleMap2 : articles) {
			String line = String.format(
					"标题：%s,\n地址：%s,\n是否原创：%s,\n发表时间：%s,\n微信昵称：%s,\n微信号：%s,\n功能介绍：%s,\n阅读数：%s,\n点赞数：%s",
					articleMap2.get("title"), articleMap2.get("url"), articleMap2.get("copyright"),
					articleMap2.get("postDate"), articleMap2.get("wx_nickname"), articleMap2.get("wx_name"),
					articleMap2.get("funcIntro"), articleMap2.get("readNum"), articleMap2.get("likeNum"));
			System.out.println(line + "\n--------------------------------");
		}
	}
}
